Reproducible analysis of Human Origins
library(knitr)
library(genio)
library(ggtree)
library(popkin)
library(BEDMatrix)
library(superadmixture)
library(tidyverse)
# for plotting
library(ggplot2)
library(grid)
library(gridGraphics)
library(ggplotify)
library(ggpubr)
library(patchwork)
library(latex2exp)1 Data Preprocessing
In this document, we demonstrate our procedures for analysis of Human Origins dataset, attributed to Lazaridis et al. (2016) and Skoglund et al. (2016). The raw data of Human Origins dataset can be found at the datasets of Reich lab:
- Human Origins present-day individuals analyzed at Lazaridis et al. (2016) is available at: https://reich.hms.harvard.edu/sites/reich.hms.harvard.edu/files/inline-files/NearEastPublic.tar.gz
- Human Origins present-day individuals analyzed at Skoglund et al. (2016) is available at: https://reich.hms.harvard.edu/sites/reich.hms.harvard.edu/files/inline-files/SkoglundEtAl2016_Pacific_FullyPublic (3).tar.gz
We use PLINK and convertf of Eigensoft for data preprocessing. The PLINK software can be downloaded from this webpage. The Eigensoft is hosted at the github, and can downloaded and installed using the commands in the README.md.
We download and unzip Human Origins data from the link listed above. We create a folder called rawdata to host the downloaded raw data and pre-processed data.
## Download the publicly-available Human Origins
mkdir -p rawdata && cd $_
BASE='https://reich.hms.harvard.edu/sites/reich.hms.harvard.edu/files/inline-files'
wget --no-check-certificate "$BASE/NearEastPublic.tar.gz"
wget --no-check-certificate "$BASE/SkoglundEtAl2016_Pacific_FullyPublic (3).tar.gz"
## Extract the files from these archives
tar -xzf "NearEastPublic.tar.gz"
tar -xzf "SkoglundEtAl2016_Pacific_FullyPublic (3).tar.gz"
rm "NearEastPublic.tar.gz"
rm "SkoglundEtAl2016_Pacific_FullyPublic (3).tar.gz"
rm AncientLazaridis2016.{geno,ind,snp}
rm HO.snp
rm HOIll.snpThe data we just downloaded is in Eigensoft’s own format. However, our pre-processing and analysis requires the PLINK BED format.
The program convertf is part of Eigensoft used for format conversion. We here use a wrapper function geno_to_bed for convertf to simplify the converting. We also add the sub-population label to the *.fam files using the third column of the *.ind file (technically “family IDs”).
CONVERTF=<PATH_TO_CONVERTF_EXECUTABLE>
function geno_to_bed {
if [ -z "$1" ]
then
# if no inputs, show usage message...
echo "Usage: geno_to_bed <file>"
echo "Converts file.{geno,snp,ind} into file.{bed,bim,fam}"
else
file=$1
# temporary "parameter" file!
file_par='par_geno_to_bed_tmp.txt'
# this is what we're writing!
# this "par" file maps inputs/outputs, and says X chromosome should have been excluded, though output has Chr 23 and 24...
cat > $file_par <<EOF
genotypename: $file.geno
snpname: $file.snp
indivname: $file.ind
outputformat: PACKEDPED
genotypeoutname: $file.bed
snpoutname: $file.bim
indivoutname: $file.fam
familynames: NO
noxdata: YES
EOF
# now run desired command!
time ./$CONVERTF -p $file_par
# remove temp file when done!
rm $file_par
fi
}
geno_to_bed "HumanOriginsPublic2068"
geno_to_bed "SkoglundEtAl2016_Pacific_FullyPublic"
join -1 2 -2 1 -o 2.3,1.2,1.3,1.4,1.5,1.6 HumanOriginsPublic2068.fam HumanOriginsPublic2068.ind > HumanOriginsPublic2068.fam.NEW
mv HumanOriginsPublic2068.fam.NEW HumanOriginsPublic2068.fam
join -1 2 -2 1 -o 2.3,1.2,1.3,1.4,1.5,1.6 SkoglundEtAl2016_Pacific_FullyPublic.fam SkoglundEtAl2016_Pacific_FullyPublic.ind |
awk -F '[ |:]' '{print $1, $3, $4, $5, $6, $7}' > SkoglundEtAl2016_Pacific_FullyPublic.fam.NEW
mv SkoglundEtAl2016_Pacific_FullyPublic.fam.NEW SkoglundEtAl2016_Pacific_FullyPublic.fam
# remove eigensoft-formatted files
rm HumanOriginsPublic2068.{geno,ind,snp}
rm SkoglundEtAl2016_Pacific_FullyPublic.{geno,ind,snp}We then merge the main Human Origins dataset with the Pacific datasets. These datasets have non-overlapping individuals that were genotyped using the same microarray platform.
PLINK=<PATH_TO_PLINK_EXECUTABLE>
./$PLINK \
--keep-allele-order \
--indiv-sort none \
--bfile HumanOriginsPublic2068 \
--bmerge SkoglundEtAl2016_Pacific_FullyPublic \
--out human_origins_and_pacific_public_preWe apply the following filters to the HumanOriginsPublic2068.{bed,bim,fam}.
- Filters for individuals. We remove a handful of individuals to simplify our figures:
- Individuals from singleton subpopulations (those with only one individual)
- Aancient individuals from Lapita_Vanuatu
- Filters for SNPs. The Pacific dataset has more stringent quality controls, so fewer loci appear in that data. We will keep only the intersection of loci.
- Simplifying the sub-population label. We replace the sub-population label
Gujarati[A-D]withGujarati.
awk -F" " '{if (NR==FNR)
freq[$1]++;
else if (freq[$1]==1 || $1=="AA" || $1=="Lapita_Vanuatu")
print $1, $2;
else next;}' human_origins_and_pacific_public_pre.fam human_origins_and_pacific_public_pre.fam > rm_fam.txt
awk -F" " '{print $2;}' SkoglundEtAl2016_Pacific_FullyPublic.bim > loci_pacific.snplist
./$PLINK \
--bfile human_origins_and_pacific_public_pre \
--extract loci_pacific.snplist \
--remove-fam rm_fam.txt \
--autosome \
--make-bed \
--maf 0.01 \
--out human_origins_and_pacific_public
awk -F" " '{
if ($1 ~ /Gujarati/) print "Gujarati",$2,$3,$4,$5,$6;
else print $1,$2,$3,$4,$5,$6;
}' human_origins_and_pacific_public.fam > human_origins_and_pacific_public.fam.NEW
mv human_origins_and_pacific_public.fam.NEW human_origins_and_pacific_public.fam
rm loci_pacific.snplist
rm rm_fam.txt2 Estimating individual-level coancestry
In the following sections, the intermediate data will be stored at the rdata folder.
X <- BEDMatrix::BEDMatrix("rawdata/human_origins_and_pacific_public", simple_names = TRUE)
fam <- genio::read_fam( "rawdata/human_origins_and_pacific_public")
info <- readr::read_tsv( "rawdata/human_origins_and_pacific_public_subpops.txt", col_types = 'ccddc')
# Reorder individuals by the sub-population order
fam <- dplyr::left_join(fam, info, by = c("fam" = "subsubpop"))
subpop_order <- c("SAfrica", "MAfrica", "NAfrica", "Europe", "Caucasus",
"MiddleEast", "SAsia", "EAsia", "NAsia", "Americas", "Oceania")
index <- order(match(fam$subpop, subpop_order))
fam <- fam[index, ]
X <- X[index, ]Here we demonstrate how to estimate the individual-level coancestry \(\boldsymbol{\Theta}\) according to the Ochoa-Storey (OS) method by popkin package. So here we only presents the codes but not running them. We provide the pre-computed coancestry at rdata/coanc_os.rda. It should noted that the popkin function returns the kinship coefficients instead of the coancestry coefficients. Therefore, we use the inbr_diag function in the popkin package to map kinship coefficients \(\phi_{jk}\)’s to coancestry coefficients \(\theta_{jk}\)’s:
\[ \theta_{jk} = \begin{cases} 2\phi_{jk} - 1 & j = k \\ \phi_{jk} &j \neq k \end{cases}. \]
obj <- popkin::popkin_A(t(X))
A_min <- popkin::popkin_A_min_subpops(obj$A, subpops = fam$fam)
kinship_os <- 1 - obj$A / A_min
coanc_os <- popkin::inbr_diag(kinship_os)
coanc_os <- ifelse(coanc_os < 0, 0, coanc_os)
save(coanc_os, file = "rdata/coanc_os.rda")
save(fam, file = "rdata/fam.rda")
save(X, file = "rdata/X.rda")We can visualize the individual-level coancestry of the simulated data using plot_popkin function in the popkin function. We use the following helper function plot_colors_subpops to label the sub-populations.
plot_colors_subpops <- function(pops, srt = 0, cex = 0.6, y = FALSE) {
n <- length(pops)
k <- unique(pops)
pops <- factor(pops, levels = unique(pops))
xintercept <- cumsum(table(pops))
breaks <- xintercept - 0.5 * as.numeric(table(pops))
if (y) {
plot(NULL, xlim = c(0, 1), ylim = c(1, n), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
text(1, n-rev(breaks), rev(unique(pops)), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
} else {
plot(NULL, xlim = c(1, n), ylim = c(0, 1), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
text(breaks, 1, unique(pops), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
}
}load("rdata/coanc_os.rda")
load("rdata/fam.rda")
par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(c(3, 1, 2), c(3, 1, 5), c(0, 4, 0)), widths = c(0.2, 1, 0.2), heights = c(0.5, 0.5, 0.3))
plot_popkin(kinship = coanc_os, layout_add = FALSE, leg_cex = 0.8, labs_text = FALSE, labs_lwd = 0.1, labs = fam$subpop, ylab = '', leg_title = "Coancestry")
par(mar = c(0.2, 0, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 0.8)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0, 0.2, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 0.8)3 Estimating admixture proportions
The following chunk of the codes estimates the admixture proportions \(\boldsymbol{Q}\) from genotypes. We first estimate the individual specific allele frequencies \(\boldsymbol{\Pi}\) using the est_p_indiv function in the superadmixture package. We then estimate \(\boldsymbol{Q}\) by decomposing \(\boldsymbol{\Pi}\) with factor_p_indiv function in the superadmixture package. It takes hours to run the following codes, so we’re not running here.
for (k_antepops in 2:15) {
# estimate individual-specific allele frequencies
obj <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE)
p_indiv <- obj$p_indiv
rowspace <- obj$rowspace
# estimate admix_props by decomposing individual-specific allele frequencies
obj <- factor_p_indiv(p_indiv = p_indiv,
rowspace = rowspace,
k_antepops = k_antepops,
init_method = "rowspace",
verbose = TRUE,
max_iters = 1000,
tol = 1e-4)
# save admixture proportions
Q_hat <- obj$Q_hat
dir.create(file.path("rdata", "Q_hat"), showWarnings = FALSE)
save(Q_hat, file = paste0("rdata/Q_hat/", k_antepops, ".rda"))
}4 Selecting number of antecedent populations
The following chunk of the codes calculates the p-values of structured Hardy-Weinberg Equilibrium test (sHWE) and plots the relationship between the negative entropy of the sHWE p-values and \(K\). It takes hours to run the following codes, so we’re not running here.
for (k_antepops in 2:15) {
# estimate `rowspace`, an input for sHWE function
rowspace <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE, rowspace_only = TRUE)
# perform sHWE test
pvals <- sHWE(X, k_antepops, loci_on_cols = TRUE, rowspace = rowspace)
save(pvals, file = paste0("rdata/sHWE/", k_antepops, ".rda"))
}
neg_entropy <- c()
for (k_antepops in 2:15) {
load(paste0("rdata/sHWE/", k_antepops, ".rda"))
neg_entropy <- c(neg_entropy, calc_neg_entropy(pvals))
}
save(neg_entropy, file = "rdata/neg_entropy.rda")We visualize the relationship between \(K\) and the negative entropy of sHWE p-values.
load("rdata/neg_entropy.rda")
plot(2:15, neg_entropy, ylim = c(-2.2, -1.85), xlab = "", ylab = "", type = "b", family="Times New Roman")
mtext(latex2exp::TeX(r"(\textit{$K$})"), side = 1, col = "black", line = 2.5, family="Times New Roman", pch = 10)
mtext("Negative Entropy", side = 2, line = 2.5, col = "black", family="Times New Roman")We select \(K = 11\) according to this plot.
5 Estimating the coancestry among antecedent populations
After obtaining individual-level coancestry \(\boldsymbol{\Theta}\) and admixture proportions \(\boldsymbol{Q}\), we can use the function est_coanc to estimate population coancestry under the super admixture and standard admixture.
load("rdata/coanc_os.rda")
load("rdata/Q_hat/11.rda")
coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")We then compute the corresponding individual-level coancestry under the super admixture and standard admixture.
6 Visualization
We can visualize the coancestry of antecedent populations coanc_antepops_sup and admixture proportions Q_hat using the helper functions get_seq_colors, plot_tree, barplot_admix and heatmap_coanc_antepops we provide.
# reorder antecedent populations according to the value of the population inbredding
index <- order(diag(coanc_antepops_sup))
Q_hat <- Q_hat[index, ]
k_antepops <- 11
coanc_antepops_sup <- coanc_antepops_sup[index, index]
# label antecedent populations
colnames(coanc_antepops_sup) <- rownames(coanc_antepops_sup) <- paste0("S", 1:k_antepops)
# fit tree
tree <- bnpsd::fit_tree(round(coanc_antepops_sup, 6))
# plot an uncolorred tree
plot_tree(tree)Based on the topology of the tree, we decide to color the populations \(S_1\), \(S_2\) by light blue and dark blue, \(S_3\), \(S_4\), \(S_5\) by light green, medium green and dark green, \(S_9\) by purple and the rest by a sequence of red colors. We can pick colors by using the get_seq_colors() function and its returned value can be used to specify the coloring scheme for plot_tree() function.
colors <- c(get_seq_colors("Blues", 2), get_seq_colors("Greens", 3), get_seq_colors("Reds", 5), get_seq_colors("Purples", 1))
names(colors) <- paste0("S", c(1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 9))
fig_tree <- plot_tree(tree, colors = colors, font_size = 16)
# flip tree
fig_tree <- ggtree::flip(fig_tree,
get_current_node("S9", tree),
get_parent_node("S6", tree))
fig_tree <- ggtree::flip(fig_tree,
get_current_node("S5", tree),
get_current_node("S3", tree))
fig_tree <- ggtree::scaleClade(fig_tree, get_parent_node("S4", tree), 2)
fig_tree <- ggtree::scaleClade(fig_tree, get_parent_node("S6", tree), 0.8)
fig_treeWe can visualize admixture proportions Q_hat using the barplot_admix function.
We also can visualize the coancestry among antecedent populations by heatmap_coanc_antepops. We define the helper function heatmap_coanc_antepops_wrapper that returns a ggplot object of the heatmap.
heatmap_coanc_antepops_wrapper <- function(coanc_antepops, tl.cex = 1.3, cl.cex = 1, tl.offset = 0.6) {
superadmixture::heatmap_coanc_antepops(coanc_antepops, tl.cex = tl.cex, cl.cex = cl.cex, tl.offset = tl.offset)
grab_grob <- function(){
gridGraphics::grid.echo()
grid::grid.grab()
}
p <- grab_grob()
# save correlation matrix colors to a vector, then make coloured matrix grob transparent
matrix.colors <- grid::getGrob(p, grid::gPath("square"), grep = TRUE)[["gp"]][["fill"]]
p <- grid::editGrob(p, grid::gPath("square"), grep = TRUE, gp = grid::gpar(col = NA, fill = NA))
# apply the saved colours to the underlying matrix grob
p <- grid::editGrob(p, grid::gPath("symbols-rect-1"), grep = TRUE, gp = grid::gpar(fill = matrix.colors))
# convert the background fill from white to transparent, while we are at it
p <- grid::editGrob(p, grid::gPath("background"), grep = TRUE, gp = grid::gpar(fill = NA))
p <- ggplotify::as.ggplot(p) + ggplot2::theme(plot.margin = unit(c(0, 0, 0, 0), "pt"))
return(p)
}par(xpd = TRUE)
fig_coancestry <- heatmap_coanc_antepops_wrapper(coanc_antepops_sup, tl.cex = 0.9, tl.offset = 1)fig_coancestry <- fig_coancestry + theme(plot.margin = margin(0, 0, 0, 0, "pt"))
coancestry_lab <- ggplot() +
annotate("text", x = 0.5, y = 0.5, size = 5, label = "Coancestry") +
xlim(0, 1) +
ylim(0, 1) +
theme_void()
fig_coancestry <- ggarrange(fig_coancestry, coancestry_lab, ncol = 1, heights = c(1, 0.1))We combine all plots.
# combine plots of tree, admix props, subpops and coancestry
design <- "
112
113
##3
"
fig_coancestry + fig_tree + fig_admix +
patchwork::plot_layout(
design = design,
widths = c(0.7, 0.2, 1.4),
heights = c(0.7, 0.2, 0.4)) +
patchwork::plot_annotation(tag_levels = 'A', tag_prefix = '(', tag_suffix = ')') &
ggplot2::theme(plot.tag = ggplot2::element_text(color = "black", size = 21, face = 'bold'))7 Simulating genotypes from the structure of Human Origins dataset
We can simulate genotypes using the double-admixture method with the dbl_admixture function. It takes about an hour to run the following codes, so we’re not running here.
X <- as.matrix(X)
p_anc <- 0.5 * colMeans(X, na.rm = TRUE)
coanc_antepops <- round(coanc_antepops, 6)
X_sim <- dbl_admixture(p_anc, coanc_antepops, Q_hat, geno_only = TRUE)
## estimate kinship
obj <- popkin::popkin_A(X_sim)
A_min <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os <- 1 - obj$A / A_min
## mapping kinship coefficients to coancestry coefficients
coanc_sim_os <- popkin::inbr_diag(kinship_sim_os)
coanc_sim_os <- ifelse(coanc_sim_os < 0, 0, coanc_sim_os)
save(coanc_sim_os, file = "rdata/coanc_sim_os.rda")We can also use NORTA method to simulate genotypes. It takes a few hours to run the following codes, so we’re not running here.
X_sim_norta <- norta_approx(p_anc, coanc_antepops, Q_hat,
geno_only = TRUE, parallel = TRUE, method = "numeric", tol = 1e-5, mc_cores = 20)
## estimate kinship
obj <- popkin::popkin_A(X_sim_norta)
A_min <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os_norta <- 1 - obj$A / A_min
## mapping kinship coefficients to coancestry coefficients
coanc_sim_os_norta <- popkin::inbr_diag(kinship_sim_os_norta)
coanc_sim_os_norta <- ifelse(coanc_sim_os_norta < 0, 0, coanc_sim_os_norta)
save(coanc_sim_os_norta, file = "rdata/coanc_sim_os_norta.rda")We plot the individual-level coancestry.
load("rdata/coanc_sim_os.rda")
load("rdata/coanc_sim_os_norta.rda")
par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(
c(14, 19, 15, 20, 16, 21, 0),
c( 7, 1, 0, 2, 0, 3, 6),
c( 0, 9, 0, 10, 0, 11, 0),
c(17, 22, 18, 23, 0, 0, 0),
c( 8, 4, 0, 5, 0, 0, 0),
c( 0, 12, 0, 13, 0, 0, 0)),
heights = c(0.2, 1, 0.22, 0.2, 1, 0.22),
widths = c(0.2, 1, 0.08, 1, 0.08, 1, 0.2))
coanc_sim_os <- (coanc_sim_os - 1) * (1 - min(coanc_sup)) + 1
coanc_sim_os_norta <- (coanc_sim_os_norta - 1) * (1 - min(coanc_sup)) + 1
# We truncate the large entries of `coanc_std`
# coanc_std <- ifelse(coanc_std > max(coanc_os), max(coanc_os), coanc_std)
popkin::plot_popkin(kinship = list(coanc_os, coanc_sup, coanc_std, coanc_sim_os, coanc_sim_os_norta),
layout_add = FALSE,
leg_cex = 0.8,
labs_text = FALSE,
labs_lwd = 0.1,
labs = fam$subpop,
ylab = '',
leg_title = "Coancestry",
panel_letters = NULL)
par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(A)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(B)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(C)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(D)", cex = 2, xpd = TRUE, font = 2)
par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(E)", cex = 2, xpd = TRUE, font = 2)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof real data", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Super admixture individual coancestry\n of real data", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Standard admixture individual coancestry\n of real data", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (double-admixture)", cex = 1.6)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (NORTA)", cex = 1.6)8 Confirming significant hypothesis tests of standard admixture versus super admixture
We perform the significance test of coancestry among antecedent populations to compare the model fit between standard admixture versus super admixture. The following chunk of codes is used to generate the null distribution of the test statistics \(U\). It should be noted that it takes days to run this chunk of code. We submit each iteration as a separate cluster job to speed up (not show here). Therefore, we’re just showing the code but not running it. We provide the pre-computed null test statistics in the file rdata/test_stats0.rda.
for (task_id in 1:1000) {
## compute null test statistics
inbr_antepops <- diag(est_coanc(coanc_os, Q_hat, model = "standard"))
## compute null
test_stats0 <- compute_null(p_anc, Q_hat, inbr_antepops, verbose = TRUE)
# save results
dir.create(file.path("rdata", "test_stat0"), showWarnings = FALSE)
save(test_stat0, file = paste0("rdata/test_stat0/", task_id, ".rda"))
}
test_stats0 <- c()
for (task_id in 1:1000) {
load(paste0("rdata/test_stat0/", task_id, ".rda"))
test_stats0 <- c(test_stats0, test_stat0)
}
save(test_stats0, file = "rdata/test_stats0.rda")We plot the distribution of null test statistics and label our observed test statistics.
load("rdata/test_stats0.rda")
load("rdata/coanc_os.rda")
load("rdata/Q_hat/11.rda")
coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")
test_stat1 <- norm(coanc_antepops_sup - coanc_antepops_std, "F")
p <- data.frame(test_stats0 = test_stats0) %>%
ggplot(data, mapping = aes(x = test_stats0)) +
geom_histogram(aes(y =..density..),
fill = "dodgerblue3",
bins = 10,
binwidth = NULL,
color = "black",
alpha = 0.7,
size = 0.1) +
labs(x = latex2exp::TeX(r"(Null test-statistics)"), y='Density') +
scale_y_continuous(limits = c(0, 250)) +
theme_bw(base_size = 16) +
ggtitle("Human Origins (HO)") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, size=16)) +
annotate("text", x = 0.068, y = 240, size = 6, label = latex2exp::TeX(sprintf("$U_{obs}$ = %.3f", test_stat1)))
p